In [1]:
################################new section classification
# import warnings
# warnings.filterwarnings("ignore")

import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import nltk
from nltk.stem import PorterStemmer
porter = PorterStemmer()
from nltk.corpus import wordnet
nltk.download('omw-1.4')
import numpy as np
from sklearn.model_selection import train_test_split

import pickle

from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import RandomOverSampler

# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument("--n", help="SDG K")
# args = parser.parse_args()
#
# number = int(args.n)
def get_category(x):
    """Collapse the 'Strongly' alignment labels into their base category.

    "Strongly Misaligned" -> "Misaligned", "Strongly Aligned" -> "Aligned";
    every other label is returned unchanged.
    """
    collapsed = {
        "Strongly Misaligned": "Misaligned",
        "Strongly Aligned": "Aligned",
    }
    return collapsed.get(x, x)


def stem_sentences(x):
    """Porter-stem the dictionary words of a space-separated sentence.

    Tokens with no WordNet synsets (non-dictionary words) are dropped;
    the surviving tokens are stemmed and re-joined with single spaces.
    """
    stemmed = [
        porter.stem(token)
        for token in x.split(" ")
        if len(wordnet.synsets(token)) != 0
    ]
    return " ".join(stemmed)

# creating bag of words representations from description
# Create a Bag of Words Model with Sklearn
# import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def get_BoW(df_wiki_node, column_name, param1=5, param2=.99):
    """Build a bag-of-words (unigram + bigram) matrix from one text column.

    Parameters
    ----------
    df_wiki_node : DataFrame holding the corpus.
    column_name : name of the text column to vectorize.
    param1 : ``min_df`` for CountVectorizer — drop rarer terms (default 5).
    param2 : ``max_df`` for CountVectorizer — drop very common terms (default .99).

    Returns
    -------
    DataFrame of raw token counts, one column per retained n-gram.
    """
    documents = df_wiki_node[column_name].values.tolist()
    vectorizer = CountVectorizer(
        min_df=param1,
        max_df=param2,
        ngram_range=(1, 2),
        stop_words='english',
    )
    counts = vectorizer.fit_transform(documents)
    bow = pd.DataFrame(counts.toarray(), columns=vectorizer.get_feature_names_out())
    return bow


import wordninja
def split_words(x):
    """Re-segment concatenated words (e.g. "cleanenergy" -> "clean energy")
    using wordninja's frequency-based splitter, joined back with spaces."""
    pieces = wordninja.split(x)
    return " ".join(pieces)

def lower_string(x):
    """Replace every ampersand with a space, then lowercase the string."""
    without_amp = x.replace("&", " ")
    return without_amp.lower()

from pattern.text.en import singularize 
def clean_text(text):
    """Extract the nouns from ``text`` and return them as one lowercase,
    space-joined string.

    Tokens whose POS tag starts with "N" are kept; those longer than 3
    characters are singularized, shorter ones (e.g. acronyms) are kept
    as-is. Returns the sentinel string "nothing" if tagging or
    singularization fails for this document.
    """
    tokens = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    try:
        nouns = []
        for word, pos in tags:
            if pos[0] == "N":
                # Singularize only longer words; very short tokens are left
                # untouched (singularize tends to mangle them).
                nouns.append(singularize(word) if len(word) > 3 else word)
        return " ".join(nouns).lower()
    # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; keep the deliberate best-effort fallback but only for
    # ordinary exceptions.
    except Exception:
        return "nothing"
# df_merge = pd.read_csv("./results_news/features4.csv")
# Names of pre-computed aggregate feature columns: summary statistics
# (sum/mean/std/median/var/min/max and 5th/95th/10th/90th percentiles) of
# magnitude, score, numMentions, avgSalience and overall_score. Not used in
# the cells below; presumably paired with the commented-out features4.csv
# load above — TODO confirm before deleting.
column_features = ['magnitude_sum', 'magnitude_mean', 'magnitude_std',
       'magnitude_median', 'magnitude_var', 'magnitude_amin', 'magnitude_amax',
       'magnitude_percentile_5', 'magnitude_percentile_95',
       'magnitude_percentile_10', 'magnitude_percentile_90', 'score_sum',
       'score_mean', 'score_std', 'score_median', 'score_var', 'score_amin',
       'score_amax', 'score_percentile_5', 'score_percentile_95',
       'score_percentile_10', 'score_percentile_90', 'numMentions_sum',
       'numMentions_mean', 'numMentions_std', 'numMentions_median',
       'numMentions_var', 'numMentions_amin', 'numMentions_amax',
       'numMentions_percentile_5', 'numMentions_percentile_95',
       'numMentions_percentile_10', 'numMentions_percentile_90',
       'avgSalience_sum', 'avgSalience_mean', 'avgSalience_std',
       'avgSalience_median', 'avgSalience_var', 'avgSalience_amin',
       'avgSalience_amax', 'avgSalience_percentile_5',
       'avgSalience_percentile_95', 'avgSalience_percentile_10',
       'avgSalience_percentile_90', 'overall_score_sum', 'overall_score_mean',
       'overall_score_std', 'overall_score_median', 'overall_score_var',
       'overall_score_amin', 'overall_score_amax',
       'overall_score_percentile_5', 'overall_score_percentile_95',
       'overall_score_percentile_10', 'overall_score_percentile_90']
# number = 1
# Accumulator for per-SDG results; not appended to in the visible cells.
all_scores_all_SDGs = []
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\qhuca\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
In [2]:
# Which SDG to model; 13 = "13. Climate Action".
number = 13


# Load MSCI SDG-alignment ratings; keep company identifiers plus the 17
# per-SDG alignment columns, with missing ratings encoded as the string '0'.
df = pd.read_csv("./data/msci3.csv")
df1 = df[["Company Name", "Company ID", '01. No Poverty', '02. Zero Hunger', '03. Good Health & Wellbeing', '04. Quality Education', '05. Gender Equality Description', '06. Clean Water and Sanitation', '07. Affordable Clean Energy', '08. Decent Work and Economic Growth', '09. Industry Innovation and Infrastructure', '10. Reduced Inequalities', '11. Sustainable Cities and Communities', '12. Responsible Consumption and Production', '13. Climate Action', '14. Life Below Water', '15. Life on Land', '16. Peace, Justice & Strong Institutions', '17. Partnership for the Goals']].fillna('0')
df1 = df1.rename(columns = {"Company Name": "company"})
# Non-breaking spaces and plain blanks in the rating columns also mean
# "no rating" and are normalized to '0'.
df1 = df1.replace('\xa0', '0')
df1 = df1.replace(' ', '0')

all_sdgs = ['01. No Poverty', '02. Zero Hunger', '03. Good Health & Wellbeing', '04. Quality Education', '05. Gender Equality Description', '06. Clean Water and Sanitation', '07. Affordable Clean Energy', '08. Decent Work and Economic Growth', '09. Industry Innovation and Infrastructure', '10. Reduced Inequalities', '11. Sustainable Cities and Communities', '12. Responsible Consumption and Production', '13. Climate Action', '14. Life Below Water', '15. Life on Land', '16. Peace, Justice & Strong Institutions', '17. Partnership for the Goals']
# all_sdgs is 0-indexed while SDG numbering is 1-indexed.
sdg = all_sdgs[number-1]
df_label = df1[["company",sdg]]
variable6 = "GICS Sector"
variable5 = sdg

print("SDG ", number, " is calculating ...... ")
# msci = pd.read_csv("./data/msci.csv")
# msci2 = pd.read_csv("./data/msci2.csv").rename(columns={"SDG_03_OPS_ALIGNMENT":"SDG_03_OPER_ALIGNMENT"})

# variable6 = "GICS Industry"

# if number >= 10:
#     variable5 = "SDG_{}_PROD_ALIGNMENT".format(number)
# else:
#     variable5 = "SDG_0{}_PROD_ALIGNMENT".format(number) # another thing

# SDG1 = msci[["Company Name", "Company ID"]].dropna()
# SDG2 = msci2[["ISSUER_NAME", "Figi", variable5]].dropna()

# df_label = SDG1.merge(SDG2, left_on="Company ID", right_on="Figi")[["Company Name", variable5]]
# df_label = df_label.rename(columns = {"Company Name": "company"})

# GICS sector per company, inner-joined (default merge) with the SDG labels.
df_sector = pd.read_csv("./data/Fundamental.csv")[["Company Name",variable6]].rename(columns={"Company Name": "company"})
# df_merge2 = df_merge.merge(df_sector,on="company", how="right")
df_merge3 = df_sector.merge(df_label)

# added
# Wikipedia product descriptions, inner-joined on company; rows missing
# any value are dropped.
df_wiki = pd.read_csv("./temp_data/wiki/wiki_product_info.csv",sep="\t")
df_merge3 = df_merge3.merge(df_wiki[["company","product_info"]],on="company").dropna()

# encoded_dict = {"Strongly Misaligned":0,'Misaligned':1,"Neutral":2,"Aligned":3,"Strongly Aligned":4}
# df_merge3[variable5] = df_merge3[variable5].map(encoded_dict)



# Text feature = sector name + product description, then: re-segment
# concatenated words, lowercase/strip '&', and keep (singularized) nouns.
df_merge3["merged_text"] = df_merge3[variable6] + " " + df_merge3["product_info"] 
# features = df_merge3["product_info"] 
df_merge3["merged_text"] = df_merge3["merged_text"].progress_apply(split_words)
df_merge3["merged_text"] = df_merge3["merged_text"].progress_apply(lower_string)
df_merge3["merged_text"] = df_merge3["merged_text"].progress_apply(clean_text)
# Drop documents too short to be informative after cleaning.
df_merge3 = df_merge3[df_merge3.merged_text.str.len()>100]

# Remove label classes that occur exactly once: the stratified
# train_test_split below needs at least 2 samples per class.
# NOTE(review): value_counts().reset_index() column naming changed in
# pandas 2.0 (labels column is named after the Series, counts column is
# "count"); this indexing assumes the older ["index", <sdg>] layout —
# verify against the installed pandas version.
tmp = df_merge3[variable5].value_counts().reset_index()
index = tmp[tmp[sdg]==1]["index"].values
df_merge3 = df_merge3[~df_merge3[variable5].isin(index)]

features = df_merge3["merged_text"]
labels = df_merge3[variable5]
SDG  13  is calculating ...... 
100%|█████████████████████████████████████████████████████████████████████████████| 1217/1217 [00:08<00:00, 147.91it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1217/1217 [00:00<00:00, 304054.56it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1217/1217 [00:31<00:00, 38.86it/s]
In [3]:
# Final modelling inputs: y = raw alignment labels, X = cleaned text.
y = labels
# X = pd.concat([features1.reset_index(drop=True),features4.reset_index(drop=True)], axis=1)
X = features
In [4]:
from sklearn import preprocessing
# Encode for string labels
# Map the original labels to contiguous integers 0..K-1 (sorted order).
label_encoder = preprocessing.LabelEncoder().fit(y)
y = label_encoder.transform(y)
In [5]:
# Sorted array of encoded class ids; reused later as LIME class names.
all_unique_labels = pd.Series(y).unique()
all_unique_labels.sort()
In [6]:
pd.Series(y).value_counts()
Out[6]:
3    1074
2      28
6      27
0      26
1      20
5      12
4       4
dtype: int64
In [7]:
labels.value_counts()
Out[7]:
0     1074
-3      28
3       27
-1      26
-2      20
2       12
1        4
Name: 13. Climate Action, dtype: int64
In [8]:
from sklearn.model_selection import train_test_split
import xgboost
import sklearn
# import lightgbm as lgb
# Stratified 80/20 split so each (imbalanced) class keeps its proportion.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(y_train.shape, y_test.shape)
(952,) (239,)
In [9]:
pd.Series(y_test).value_counts()
Out[9]:
3    216
2      6
0      5
6      5
1      4
5      2
4      1
dtype: int64
In [10]:
# TF-IDF over uni- and bigrams; drop terms in >70% or <1% of documents.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(stop_words = "english", ngram_range=(1,2), max_df=0.7, min_df=0.01)
# , max_df=0.7, min_df=0.05
train_vectors = vectorizer.fit_transform(X_train)
# Test set is only transformed — vectorizer is fit on train to avoid leakage.
test_vectors = vectorizer.transform(X_test)
In [11]:
from sklearn.base import clone, BaseEstimator, ClassifierMixin
class OrdinalClassifier():
    """Ordinal classification via K-1 binary subproblems (Frank & Hall style):
    the i-th binary classifier estimates Pr(y > class_i), and per-class
    probabilities are recovered from consecutive differences.

    Parameters
    ----------
    clf : scikit-learn style classifier with ``predict_proba``; it is cloned
        (unfitted) once per class threshold.
    """

    def __init__(self, clf):
        self.clf = clf    # template estimator, cloned per threshold
        self.clfs = {}    # threshold index -> fitted binary classifier

    def fit(self, X, y):
        """Fit one binary classifier per class threshold.

        Returns ``self`` (scikit-learn convention) so calls can be chained.
        """
        self.unique_class = np.sort(np.unique(y))
        # BUG FIX: the guard was `> 2`, which silently skipped fitting for
        # binary problems and made predict_proba fail later with a KeyError.
        # One threshold classifier is exactly right for 2 classes.
        if self.unique_class.shape[0] >= 2:
            for i in range(self.unique_class.shape[0]-1):
                # Binary target: does the sample lie strictly above class i?
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) probability array whose columns
        follow the order of ``self.unique_class``."""
        clfs_predict = {k: self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i, y in enumerate(self.unique_class):
            if i == 0:
                # P(class_0) = 1 - Pr(y > class_0)
                predicted.append(1 - clfs_predict[i][:,1])
            elif i in clfs_predict:
                # P(class_i) = Pr(y > class_{i-1}) - Pr(y > class_i)
                predicted.append(clfs_predict[i-1][:,1] - clfs_predict[i][:,1])
            else:
                # Last class: P(class_K) = Pr(y > class_{K-1})
                predicted.append(clfs_predict[i-1][:,1])
        return np.vstack(predicted).T

    def predict(self, X):
        """Return the argmax *index* into ``self.unique_class`` per sample.

        NOTE(review): this is the positional index, not the original label;
        callers must map through ``self.unique_class`` when labels are not
        already 0..K-1 (they are in this notebook, after LabelEncoder).
        """
        return np.argmax(self.predict_proba(X), axis=1)
In [12]:
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample minority classes — on the TRAINING vectors only.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(train_vectors, y_train)

# Multinomial Naive Bayes with light smoothing as the baseline classifier.
nb = MultinomialNB(alpha=.01)
nb.fit(X_resampled, y_resampled)

# nb = OrdinalClassifier(nb)
# nb.fit(X_resampled, y_resampled)
Out[12]:
MultinomialNB(alpha=0.01)
In [13]:
# Weighted F1 on the untouched test split.
y_pred = nb.predict(test_vectors)
sklearn.metrics.f1_score(y_test, y_pred, average='weighted')
Out[13]:
0.8708906156604902
In [14]:
# Explaining predictions using lime
from lime import lime_text
from sklearn.pipeline import make_pipeline
# Pipeline raw text -> tfidf -> NB, so LIME can perturb raw strings directly.
c = make_pipeline(vectorizer, nb)
In [15]:
print(c.predict_proba([X_test.values[8]]).round(3))
[[0.01 0.   0.   0.99 0.   0.   0.  ]]
In [16]:
# LIME text explainer labelled with the encoded class ids.
class_names = all_unique_labels

from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=class_names)
In [17]:
# Pick up to three correctly-classified test indices per class to explain.
examples = []
for element in class_names:
    correct = np.where(np.equal(y_pred, y_test))[0]    # indices predicted correctly
    of_class = np.where(np.equal(y_test, element))[0]  # indices with this true label
    mask = np.in1d(correct, of_class)
    # BUG FIX: the bare `try/except: continue` was dead code — numpy boolean
    # indexing and slicing never raise here; an unrepresented class simply
    # contributes an empty selection.
    examples.append(correct[np.where(mask)[0]][0:3])
In [18]:
from itertools import chain
# Flatten the per-class index arrays into one flat list of example indices.
examples = list(chain.from_iterable(examples))
In [19]:
pd.Series(y).value_counts()
Out[19]:
3    1074
2      28
6      27
0      26
1      20
5      12
4       4
dtype: int64
In [20]:
labels.value_counts()
Out[20]:
0     1074
-3      28
3       27
-1      26
-2      20
2       12
1        4
Name: 13. Climate Action, dtype: int64
In [21]:
# Explain each selected example with LIME and render the explanation inline.
for element in examples:
    idx=element
    # BUG FIX: explain_instance was called twice per document and the first
    # (equally expensive) result discarded; one call with top_labels=2
    # provides everything printed below.
    exp = explainer.explain_instance(X_test.values[idx], c.predict_proba, num_features=6, top_labels=2)
    print('Document id: %d' % idx)
    print('Predicted class =', class_names[nb.predict(test_vectors[idx]).reshape(1,-1)[0,0]])
    print('True class: %s' % class_names[y_test[idx]])
    print('Most possible two classes: %s' % exp.available_labels())
    exp.show_in_notebook(text=True)
Document id: 6
Predicted class = 2
True class: 2
Most possible two classes: [2, 0]
Document id: 0
Predicted class = 3
True class: 3
Most possible two classes: [3, 1]
Document id: 1
Predicted class = 3
True class: 3
Most possible two classes: [3, 0]
Document id: 2
Predicted class = 3
True class: 3
Most possible two classes: [3, 0]
Document id: 172
Predicted class = 6
True class: 6
Most possible two classes: [6, 3]
In [22]:
# print ('Explanation for class %s' % class_names[0])
# print ('\n'.join(map(str, exp.as_list(label=0))))
# print ()
# print ('Explanation for class %s' % class_names[2])
# print ('\n'.join(map(str, exp.as_list(label=2))))
# print ()
# print ('Explanation for class %s' % class_names[4])
# print ('\n'.join(map(str, exp.as_list(label=4))))
In [23]:
# exp.show_in_notebook(text=True)
In [24]:
# exp.show_in_notebook(text=X_test.values[idx], labels=(0,))
In [ ]:
 
In [25]:
# cufflinks wires plotly's .iplot() onto pandas objects for the bar charts below.
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
In [26]:
from lime import submodular_pick
# Submodular pick: choose 5 diverse, representative LIME explanations from a
# 500-document sample of the training set (slow — one LIME run per document).
sp_obj = submodular_pick.SubmodularPick(explainer, X_train.values, c.predict_proba, sample_size=500, num_features=10, num_exps_desired=5)
#Plot the 5 explanations
# [exp.as_pyplot_figure(label=exp.available_labels()[0]) for exp in sp_obj.sp_explanations];
In [27]:
# Make it into a dataframe
# Rows = picked explanations, columns = features, values = LIME weights for
# each explanation's top predicted label; absent features filled with 0.
W_pick=pd.DataFrame([dict(this.as_list(this.available_labels()[0])) for this in sp_obj.sp_explanations]).fillna(0)
 
W_pick['prediction'] = [this.available_labels()[0] for this in sp_obj.sp_explanations]
 
#Making a dataframe of all the explanations of sampled points
W=pd.DataFrame([dict(this.as_list(this.available_labels()[0])) for this in sp_obj.explanations]).fillna(0)
W['prediction'] = [this.available_labels()[0] for this in sp_obj.explanations]
In [28]:
#Plotting the aggregate importances
# Top 25 features by mean absolute LIME weight across all sampled explanations.
np.abs(W.drop("prediction", axis=1)).mean(axis=0).sort_values(ascending=False).head(
    25
).sort_values(ascending=True).iplot(kind="barh")
 
#Aggregate importances split by classes
grped_coeff = W.groupby("prediction").mean()
 
grped_coeff = grped_coeff.T
# Rank features by the magnitude of their weight in the first class column.
grped_coeff["abs"] = np.abs(grped_coeff.iloc[:, 0])
grped_coeff.sort_values("abs", inplace=True, ascending=False)
grped_coeff.head(25).sort_values("abs", ascending=True).drop("abs", axis=1).iplot(
    kind="barh", bargap=0.5
) 
In [29]:
pd.Series(y).value_counts()
Out[29]:
3    1074
2      28
6      27
0      26
1      20
5      12
4       4
dtype: int64
In [30]:
labels.value_counts()
Out[30]:
0     1074
-3      28
3       27
-1      26
-2      20
2       12
1        4
Name: 13. Climate Action, dtype: int64
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [31]:
from sklearn.metrics import confusion_matrix
# Rows = true encoded class, columns = predicted; the output shows the model
# mostly predicts the dominant class (3).
confusion_matrix(y_test, y_pred)
Out[31]:
array([[  0,   0,   0,   5,   0,   0,   0],
       [  0,   0,   0,   4,   0,   0,   0],
       [  1,   0,   1,   3,   0,   0,   1],
       [  1,   0,   1, 213,   0,   0,   1],
       [  0,   0,   0,   1,   0,   0,   0],
       [  0,   0,   0,   2,   0,   0,   0],
       [  0,   0,   0,   4,   0,   0,   1]], dtype=int64)